PM566 Assignment 1

#Name:Liying Deng

library(data.table)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:data.table':

    between, first, last
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.1     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()     masks data.table::between()
✖ dplyr::filter()      masks stats::filter()
✖ dplyr::first()       masks data.table::first()
✖ lubridate::hour()    masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag()         masks stats::lag()
✖ dplyr::last()        masks data.table::last()
✖ lubridate::mday()    masks data.table::mday()
✖ lubridate::minute()  masks data.table::minute()
✖ lubridate::month()   masks data.table::month()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second()  masks data.table::second()
✖ purrr::transpose()   masks data.table::transpose()
✖ lubridate::wday()    masks data.table::wday()
✖ lubridate::week()    masks data.table::week()
✖ lubridate::yday()    masks data.table::yday()
✖ lubridate::year()    masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaflet)

#Question 1

Dataset2022 <- data.table::fread("/Users/kristydeng/Downloads/2022 Dataset.csv")
Dataset2002 <-data.table::fread("/Users/kristydeng/Downloads/2002 Dataset.csv")

#Question 2

dataset2002 <- fread("/Users/kristydeng/Downloads/2002 Dataset.csv")  
dataset2022 <- fread("/Users/kristydeng/Downloads/2022 Dataset.csv")
dataset2002 <- dataset2002 %>%
  mutate(Year = 2002)
dataset2022 <- dataset2022 %>%
  mutate(Year = 2022)
colnames(dataset2002)
 [1] "Date"                           "Source"                        
 [3] "Site ID"                        "POC"                           
 [5] "Daily Mean PM2.5 Concentration" "Units"                         
 [7] "Daily AQI Value"                "Local Site Name"               
 [9] "Daily Obs Count"                "Percent Complete"              
[11] "AQS Parameter Code"             "AQS Parameter Description"     
[13] "Method Code"                    "Method Description"            
[15] "CBSA Code"                      "CBSA Name"                     
[17] "State FIPS Code"                "State"                         
[19] "County FIPS Code"               "County"                        
[21] "Site Latitude"                  "Site Longitude"                
[23] "Year"                          
colnames(dataset2022)
 [1] "Date"                           "Source"                        
 [3] "Site ID"                        "POC"                           
 [5] "Daily Mean PM2.5 Concentration" "Units"                         
 [7] "Daily AQI Value"                "Local Site Name"               
 [9] "Daily Obs Count"                "Percent Complete"              
[11] "AQS Parameter Code"             "AQS Parameter Description"     
[13] "Method Code"                    "Method Description"            
[15] "CBSA Code"                      "CBSA Name"                     
[17] "State FIPS Code"                "State"                         
[19] "County FIPS Code"               "County"                        
[21] "Site Latitude"                  "Site Longitude"                
[23] "Year"                          
combined_data <- bind_rows(dataset2002, dataset2022)
print(head(combined_data))
         Date Source  Site ID   POC Daily Mean PM2.5 Concentration    Units
       <char> <char>    <int> <int>                          <num>   <char>
1: 01/05/2002    AQS 60010007     1                           25.1 ug/m3 LC
2: 01/06/2002    AQS 60010007     1                           31.6 ug/m3 LC
3: 01/08/2002    AQS 60010007     1                           21.4 ug/m3 LC
4: 01/11/2002    AQS 60010007     1                           25.9 ug/m3 LC
5: 01/14/2002    AQS 60010007     1                           34.5 ug/m3 LC
6: 01/17/2002    AQS 60010007     1                           41.0 ug/m3 LC
   Daily AQI Value Local Site Name Daily Obs Count Percent Complete
             <int>          <char>           <int>            <num>
1:              81       Livermore               1              100
2:              93       Livermore               1              100
3:              74       Livermore               1              100
4:              82       Livermore               1              100
5:              98       Livermore               1              100
6:             115       Livermore               1              100
   AQS Parameter Code AQS Parameter Description Method Code
                <int>                    <char>       <int>
1:              88101  PM2.5 - Local Conditions         120
2:              88101  PM2.5 - Local Conditions         120
3:              88101  PM2.5 - Local Conditions         120
4:              88101  PM2.5 - Local Conditions         120
5:              88101  PM2.5 - Local Conditions         120
6:              88101  PM2.5 - Local Conditions         120
                      Method Description CBSA Code
                                  <char>     <int>
1: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
2: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
3: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
4: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
5: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
6: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
                           CBSA Name State FIPS Code      State
                              <char>           <int>     <char>
1: San Francisco-Oakland-Hayward, CA               6 California
2: San Francisco-Oakland-Hayward, CA               6 California
3: San Francisco-Oakland-Hayward, CA               6 California
4: San Francisco-Oakland-Hayward, CA               6 California
5: San Francisco-Oakland-Hayward, CA               6 California
6: San Francisco-Oakland-Hayward, CA               6 California
   County FIPS Code  County Site Latitude Site Longitude  Year
              <int>  <char>         <num>          <num> <num>
1:                1 Alameda      37.68753      -121.7842  2002
2:                1 Alameda      37.68753      -121.7842  2002
3:                1 Alameda      37.68753      -121.7842  2002
4:                1 Alameda      37.68753      -121.7842  2002
5:                1 Alameda      37.68753      -121.7842  2002
6:                1 Alameda      37.68753      -121.7842  2002
setnames(combined_data, old = "Daily Mean PM2.5 Concentration", new = "PM2.5")
setnames(combined_data, old = "Site Latitude", new = "Latitude")
setnames(combined_data, old = "Site Longitude", new = "Longitude")
setnames(combined_data, old = "Local Site Name", new = "Site")
setnames(combined_data, old = "Daily Obs Count", new = "Obs.Count")
setnames(combined_data, old = "Daily AQI Value", new = "AQI")

#Question 3

unique_years <- unique(combined_data$Year)
print(unique_years)
[1] 2002 2022
pal <- colorFactor(
  palette = c("red", "blue"),  
  domain = unique_years 
)
leaflet(combined_data) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~Longitude, lat = ~Latitude,
    color = ~pal(Year), 
    popup = ~Site,  
    label = ~paste("Year:", Year, "<br>Site:", Site),  
    radius = 6, 
    stroke = FALSE, fillOpacity = 0.8  
  ) %>%
  addLegend(
    position = "bottomright",
    pal = pal,
    values = ~Year,
    title = "Year",
    opacity = 1
  )

#Question 4

summary_stats <- combined_data %>%
  summarise(
    total_observations = n(),
    missing_values = sum(is.na(PM2.5)),
    missing_proportion = mean(is.na(PM2.5)),
    min_PM2.5 = min(PM2.5, na.rm = TRUE),
    max_PM2.5 = max(PM2.5, na.rm = TRUE),
    outliers = sum(PM2.5 < 0 | PM2.5 > 400, na.rm = TRUE), 
    outliers_proportion = mean(PM2.5 < 0 | PM2.5 > 400, na.rm = TRUE)
  )
(summary_stats)
  total_observations missing_values missing_proportion min_PM2.5 max_PM2.5
1              75732              0                  0      -6.7     302.5
  outliers outliers_proportion
1      215         0.002838958
cleaned_data <- combined_data %>%
  filter(PM2.5 >= 0 & PM2.5 <= 400)
(head(cleaned_data))
         Date Source  Site ID   POC PM2.5    Units   AQI      Site Obs.Count
       <char> <char>    <int> <int> <num>   <char> <int>    <char>     <int>
1: 01/05/2002    AQS 60010007     1  25.1 ug/m3 LC    81 Livermore         1
2: 01/06/2002    AQS 60010007     1  31.6 ug/m3 LC    93 Livermore         1
3: 01/08/2002    AQS 60010007     1  21.4 ug/m3 LC    74 Livermore         1
4: 01/11/2002    AQS 60010007     1  25.9 ug/m3 LC    82 Livermore         1
5: 01/14/2002    AQS 60010007     1  34.5 ug/m3 LC    98 Livermore         1
6: 01/17/2002    AQS 60010007     1  41.0 ug/m3 LC   115 Livermore         1
   Percent Complete AQS Parameter Code AQS Parameter Description Method Code
              <num>              <int>                    <char>       <int>
1:              100              88101  PM2.5 - Local Conditions         120
2:              100              88101  PM2.5 - Local Conditions         120
3:              100              88101  PM2.5 - Local Conditions         120
4:              100              88101  PM2.5 - Local Conditions         120
5:              100              88101  PM2.5 - Local Conditions         120
6:              100              88101  PM2.5 - Local Conditions         120
                      Method Description CBSA Code
                                  <char>     <int>
1: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
2: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
3: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
4: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
5: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
6: Andersen RAAS2.5-300 PM2.5 SEQ w/WINS     41860
                           CBSA Name State FIPS Code      State
                              <char>           <int>     <char>
1: San Francisco-Oakland-Hayward, CA               6 California
2: San Francisco-Oakland-Hayward, CA               6 California
3: San Francisco-Oakland-Hayward, CA               6 California
4: San Francisco-Oakland-Hayward, CA               6 California
5: San Francisco-Oakland-Hayward, CA               6 California
6: San Francisco-Oakland-Hayward, CA               6 California
   County FIPS Code  County Latitude Longitude  Year
              <int>  <char>    <num>     <num> <num>
1:                1 Alameda 37.68753 -121.7842  2002
2:                1 Alameda 37.68753 -121.7842  2002
3:                1 Alameda 37.68753 -121.7842  2002
4:                1 Alameda 37.68753 -121.7842  2002
5:                1 Alameda 37.68753 -121.7842  2002
6:                1 Alameda 37.68753 -121.7842  2002

#Question 5

state_summary <- combined_data %>%
  group_by(State) %>%
  summarise(
    mean_PM2.5 = mean(PM2.5, na.rm = TRUE),
    median_PM2.5 = median(PM2.5, na.rm = TRUE),
    sd_PM2.5 = sd(PM2.5, na.rm = TRUE),
    count = n()
  )
(state_summary)
# A tibble: 1 × 5
  State      mean_PM2.5 median_PM2.5 sd_PM2.5 count
  <chr>           <dbl>        <dbl>    <dbl> <int>
1 California       10.1          7.6     9.82 75732
ggplot(combined_data, aes(x = State, y = PM2.5, fill = State)) +
  geom_boxplot() +
  labs(title = "PM2.5 Levels by State", x = "State", y = "PM2.5") +
  theme_minimal()

county_summary <- combined_data %>%
  group_by(County) %>%
  summarise(
    mean_PM2.5 = mean(PM2.5, na.rm = TRUE),
    median_PM2.5 = median(PM2.5, na.rm = TRUE),
    sd_PM2.5 = sd(PM2.5, na.rm = TRUE),
    count = n()
  )
(county_summary)
# A tibble: 51 × 5
   County       mean_PM2.5 median_PM2.5 sd_PM2.5 count
   <chr>             <dbl>        <dbl>    <dbl> <int>
 1 Alameda            8.81         7.2      6.21  1994
 2 Butte              8.73         6        8.90  1594
 3 Calaveras          6.60         5.3      4.71   415
 4 Colusa             8.40         7        6.32   496
 5 Contra Costa       9.98         7.8      8.93  1093
 6 Del Norte          4.75         4.05     3.43   568
 7 El Dorado          4.47         3.1      7.21   436
 8 Fresno            12.3          8.4     12.1   3521
 9 Glenn              5.34         4.4      4.98   351
10 Humboldt           7.11         6        4.45   175
# ℹ 41 more rows
ggplot(combined_data, aes(x = PM2.5, fill = County)) +
  geom_histogram(bins = 20, alpha = 0.6, position = "identity") +
  facet_wrap(~ County) +
  labs(title = "Distribution of PM2.5 Levels by County", x = "PM2.5", y = "Frequency") +
  theme_minimal()

la_sites <- combined_data %>%
  filter(County == "Los Angeles")
site_summary <- la_sites %>%
  group_by(Site) %>%
  summarise(
    mean_PM2.5 = mean(PM2.5, na.rm = TRUE),
    median_PM2.5 = median(PM2.5, na.rm = TRUE),
    sd_PM2.5 = sd(PM2.5, na.rm = TRUE),
    count = n()
  )
(site_summary)
# A tibble: 18 × 5
   Site                             mean_PM2.5 median_PM2.5 sd_PM2.5 count
   <chr>                                 <dbl>        <dbl>    <dbl> <int>
 1 ""                                    23.9          21.4    12.3    118
 2 "Azusa"                               18.7          16.7    11.9    415
 3 "Burbank"                             24.0          21.6    12.7    122
 4 "Compton"                             13.0          11.9     6.22   723
 5 "Glendora"                             8.42          7.8     5.47   365
 6 "Lancaster-Division Street"            8.19          7.7     3.29   455
 7 "Lebec"                                4.46          4.2     2.58   150
 8 "Long Beach (North)"                  18.2          15.5    10.3    411
 9 "Long Beach (South)"                  12.0          11.3     4.32   239
10 "Long Beach-Route 710 Near Road"      13.0          12.1     5.61   574
11 "Los Angeles-North Main Street"       14.6          12.4     8.72  1271
12 "Lynwood"                             23.3          19.8    12.0    122
13 "North Hollywood (NOHO)"              13.0          13.1     4.75   365
14 "Pasadena"                            14.7          11.8    10.0    241
15 "Pico Rivera #2"                      11.4          10.0     5.93   118
16 "Reseda"                              12.3          11       7.05   602
17 "Santa Clarita"                        9.14          8.5     3.90   365
18 "Signal Hill (LBSH)"                   8.85          8.5     4.42   293
ggplot(la_sites, aes(x = Date, y = PM2.5, color = Site)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(title = "PM2.5 Levels Over Time by Site in Los Angeles", x = "Date", y = "PM2.5 ") +
  theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.